import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
# Load the raw per-institution, per-year financial records.
df = pd.read_csv('data/college_financials.csv', header=0)

# NOTE: the sector filter is disabled, so ``yr4`` is an alias of ``df`` (not a
# copy); the ``namecity`` column added below therefore also appears on ``df``.
# To restrict the data again, use: df.query('sector in (1,2,3)')
yr4 = df
len(yr4)

# Bar chart of the number of records per academic year.
yr4['academic_year'].value_counts().sort_index().plot.bar()

# Institution key: name + city + state concatenated.
yr4['namecity'] = yr4['inst_name'] + yr4['city'] + yr4['state']

# Records per institution, then a bar chart of how many institutions have
# each record count.
schoolrecords = yr4['namecity'].value_counts()
schoolrecords.value_counts().sort_index().plot.bar()

# Attach each institution's record count to its rows.  The key and count
# columns are named explicitly instead of relying on join suffixes: suffixes
# are only applied when the value_counts() result carries the same name as the
# key column, which is no longer the case on pandas >= 2.0 (it is 'count').
joined = yr4.rename(columns={'namecity': 'namecityyr4'}).join(
    schoolrecords.rename('namecitycnt'), on='namecityyr4')

# Keep only institutions that have exactly 13 records, then drop the helper
# count column.
final_df = joined[joined['namecitycnt'] == 13].drop('namecitycnt', axis=1)
len(final_df)
# For every processed column, optionally adjust it for inflation, then add
#   <col>_prev   - the same institution's value in the preceding record, and
#   <col>_change - the relative change from <col>_prev to <col>.
#
# "Preceding" must mean the preceding academic year, so the shift is computed
# over rows ordered by academic_year (stable sort) rather than in file order.
# The result is assigned back by index label, so final_df keeps its row order.
# This relies on final_df having a unique index.
year_order = final_df.sort_values('academic_year', kind='mergesort').index
school_key = final_df.loc[year_order, 'namecityyr4']

# Iterate over a snapshot of the column names: the loop body adds columns.
for c in list(final_df.columns):
    if final_df[c].dtype in (object, np.int64):
        print('skipping {}'.format(c))
        continue
    if '_pct' not in c and 'index' not in c and 'scalar' not in c:
        # adjust for inflation
        print('adjusting {} for inflation'.format(c))
        # Division aligns on the row index shared by df and final_df.
        final_df[c] = final_df[c] / df['hepi_scalar_2012']
    prev_col = '{}_prev'.format(c)
    final_df[prev_col] = final_df.loc[year_order, c].groupby(school_key).shift(1)
    final_df['{}_change'.format(c)] = (final_df[c] - final_df[prev_col]) / final_df[prev_col]
# Select the rows for Stanford University.  This is a bare expression, so it
# is only shown when evaluated interactively (e.g. at the end of a notebook
# cell).
is_stanford = final_df['inst_name'] == 'Stanford University'
final_df[is_stanford]

# Write the filtered, augmented table to disk.
final_df.to_csv('data/college_financials_only13.csv')

# Seaborn appearance used by the plots that follow.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
# For each processed column, scatter its relative change against its previous
# value, one facet per sector, coloured by academic year.
for c in list(final_df.columns):
    # Skip object/int64 columns, the derived *_prev / *_change columns, and
    # any column whose name contains 'cpi', 'hepi' or 'heca'.
    if final_df[c].dtype in (object, np.int64) or '_prev' in c or '_change' in c or 'cpi' in c or 'hepi' in c or 'heca' in c:
        continue
    prev_col = '{}_prev'.format(c)
    change_col = '{}_change'.format(c)
    temp_df = final_df[[prev_col, change_col, 'academic_year', 'sector']].copy()
    if '_pct' not in c:
        # Signed log10 of the previous value: sign(x) * log10(|x|), with 0
        # mapped to 0.  Taking the log of |x| (not x) keeps negative values
        # on the plot instead of turning them into NaN.
        raw = temp_df[prev_col]
        temp_df[prev_col] = np.sign(raw) * np.log10(raw.abs().where(raw != 0, 1))
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    lm = sns.lmplot(x=prev_col,
                    y=change_col,
                    data=temp_df,
                    fit_reg=False,
                    hue='academic_year', col='sector', sharex=False)
    # Apply the limits to every facet rather than assuming exactly three.
    for ax in lm.axes.flat:
        ax.set_ylim([-1, 2])
        if '_pct' in c:
            ax.set_xlim([0, 100])
# For each processed column, scatter its value against its previous value, one
# facet per sector, coloured by academic year, with a y = x reference line.
for c in list(final_df.columns):
    # Skip object/int64 columns, the derived *_prev / *_change columns, and
    # any column whose name contains 'cpi', 'hepi' or 'heca'.
    if final_df[c].dtype in (object, np.int64) or '_prev' in c or '_change' in c or 'cpi' in c or 'hepi' in c or 'heca' in c:
        continue
    prev_col = '{}_prev'.format(c)
    temp_df = final_df[[prev_col, c, 'academic_year', 'sector']].copy()
    if '_pct' not in c:
        # Signed log10 of both axes: sign(x) * log10(|x|), with 0 mapped to 0.
        # Taking the log of |x| (not x) keeps negative values on the plot
        # instead of turning them into NaN.
        for name in (prev_col, c):
            raw = temp_df[name]
            temp_df[name] = np.sign(raw) * np.log10(raw.abs().where(raw != 0, 1))
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    lm = sns.lmplot(x=prev_col, y=c,
                    data=temp_df,
                    fit_reg=False,
                    hue='academic_year', col='sector', sharex=False)
    if '_pct' in c:
        for ax in lm.axes.flat:
            ax.set_xlim([0, 100])
            ax.set_ylim([0, 100])
    # Use the range common to both axes of the first facet as the extent of
    # the diagonal, and apply that same square window to every facet.
    xlims = lm.axes[0, 0].get_xlim()
    ylims = lm.axes[0, 0].get_ylim()
    start = np.maximum(xlims[0], ylims[0])
    end = np.minimum(xlims[1], ylims[1])
    for ax in lm.axes.flat:
        ax.plot([start, end], [start, end])
        ax.set_xlim([start, end])
        ax.set_ylim([start, end])
def dig_deep(col):
    """Plot ``col`` against its previous value on a year-by-sector facet grid.

    Reads the module-level ``final_df``, which must contain ``col``,
    ``'<col>_prev'``, ``'academic_year'`` and ``'sector'``.  Both plotted
    columns are transformed with a signed log10 (``sign(x) * log10(|x|)``,
    with 0 mapped to 0) on a copy, so ``final_df`` itself is not modified.
    Each facet gets a y = x reference line.

    Args:
        col: Name of the column in ``final_df`` to plot.

    Returns:
        None. The figure is created as a side effect.
    """
    prev_col = '{}_prev'.format(col)
    temp_df = final_df[['academic_year', 'sector', col, prev_col]].copy()
    for name in (col, prev_col):
        raw = temp_df[name]
        # log10 of |x| rather than x, so negative values stay on the plot
        # instead of becoming NaN.
        temp_df[name] = np.sign(raw) * np.log10(raw.abs().where(raw != 0, 1))
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    lm = sns.lmplot(x=prev_col, y=col, fit_reg=False, col='sector',
                    row='academic_year', data=temp_df, scatter=True)
    # Walk the grid as it was actually built instead of assuming 13 x 3.
    for row_axes in lm.axes:
        # The diagonal's extent is the range common to both axes of the
        # first facet in this row.
        xlims = row_axes[0].get_xlim()
        ylims = row_axes[0].get_ylim()
        start = np.maximum(xlims[0], ylims[0])
        end = np.minimum(xlims[1], ylims[1])
        for ax in row_axes:
            ax.plot([start, end], [start, end])
            ax.set_xlim([start, end])
            ax.set_ylim([start, end])
# Draw the year-by-sector facet grid for each column of interest, in order.
for column_name in ('investment', 'grant02', 'institutional_grant_aid'):
    dig_deep(column_name)